In [11]:
import pandas as pd

In [4]:
import boto

In [5]:
# Open an S3 connection through boto. No credentials are passed here, so they
# are presumably picked up from boto's own configuration / environment.
c = boto.connect_s3()

In [6]:
# Bucket handle used by every listing below; its keys are the
# `tfl_api_line_mode_status_tube_<date>_<time>.json` snapshots.
b = c.get_bucket('pivotal-london-dis')

In [7]:
# Listing of the whole bucket (no prefix filter). This is a boto
# BucketListResultSet, not a concrete list; iterate it to obtain Key objects.
list_of_files = b.list()

In [8]:
# Display the object to check what kind of result `b.list()` returned.
list_of_files


Out[8]:
<boto.s3.bucketlistresultset.BucketListResultSet at 0x107702550>

In [9]:
# Peek at the first key only. The loop variable `l` deliberately stays bound
# after the `break`: later cells reuse it as the sample Key.
for l in list_of_files:
    print(l)
    break  # stop after the first key instead of walking the whole bucket


<Key: pivotal-london-dis,tfl_api_line_mode_status_tube_2015-02-24_11:51:45.json>

In [12]:
# Parse the sample snapshot: the boto Key `l` (bound by the loop cell above)
# is handed directly to pandas, yielding one row per tube line.
pd.read_json(l)


Out[12]:
$type created disruptedNaptanIds id lineStatuses modeName modified name routeSections
0 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.307 [940GZZLUBST, 940GZZLUCHX, 940GZZLUEAC, 940GZZ... bakerloo [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.307 Bakerloo []
1 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.587 [940GZZLUBKE, 940GZZLUBLG, 940GZZLUBND, 940GZZ... central [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.587 Central []
2 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.18 [940GZZLUALD, 940GZZLUBBN, 940GZZLUBKF, 940GZZ... circle [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.180 Circle []
3 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.21 [910GBLFR, 940GZZLUADE, 940GZZLUBBB, 940GZZLUB... district [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.210 District []
4 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.413 [940GZZLUADE, 940GZZLUBBB, 940GZZLUBBN, 940GZZ... hammersmith-city [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.413 Hammersmith & City []
5 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.96 [940GZZLUBMY, 940GZZLUBND, 940GZZLUBST, 940GZZ... jubilee [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.960 Jubilee []
6 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:56.947 [940GZZLUALD, 940GZZLUBBN, 940GZZLUBST, 940GZZ... metropolitan [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:56.947 Metropolitan []
7 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.837 [940GZZLUAGL, 940GZZLUBLM, 940GZZLUBNK, 940GZZ... northern [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.837 Northern []
8 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.727 [940GZZLUACT, 940GZZLUALP, 940GZZLUASG, 940GZZ... piccadilly [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.727 Piccadilly []
9 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.93 [940GZZLUBLR, 940GZZLUBXN, 940GZZLUEUS, 940GZZ... victoria [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.930 Victoria []
10 Tfl.Api.Presentation.Entities.Line, Tfl.Api.Pr... 2015-02-23T13:10:57.617 [] waterloo-city [{'statusSeverityDescription': 'Good Service',... tube 2015-02-23 13:10:57.617 Waterloo & City []

In [66]:
# Restrict the listing to keys whose names start with this prefix, i.e. the
# snapshots stamped 2015-02-24 11:xx.
l2 = b.list(prefix='tfl_api_line_mode_status_tube_2015-02-24_11:')

In [67]:
# Materialise the prefix listing to see exactly which keys matched.
list(l2)


Out[67]:
[<Key: pivotal-london-dis,tfl_api_line_mode_status_tube_2015-02-24_11:51:45.json>,
 <Key: pivotal-london-dis,tfl_api_line_mode_status_tube_2015-02-24_11:52:44.json>,
 <Key: pivotal-london-dis,tfl_api_line_mode_status_tube_2015-02-24_11:53:44.json>,
 <Key: pivotal-london-dis,tfl_api_line_mode_status_tube_2015-02-24_11:54:45.json>,
 <Key: pivotal-london-dis,tfl_api_line_mode_status_tube_2015-02-24_11:55:44.json>,
 <Key: pivotal-london-dis,tfl_api_line_mode_status_tube_2015-02-24_11:56:45.json>]

In [31]:
import sys

# Make the parent directory importable so the local `dis_ds` package resolves.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [77]:
from dis_ds import parsing

In [89]:
# Pick up edits made to dis_ds/parsing.py without restarting the kernel.
# `importlib.reload` replaces `imp.reload`: the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12.
import importlib
importlib.reload(parsing)


Out[89]:
<module 'dis_ds.parsing' from '../dis_ds/parsing.py'>

In [38]:
# The sample Key's object name, as a plain string.
l.key


Out[38]:
'tfl_api_line_mode_status_tube_2015-02-24_11:51:45.json'

In [54]:
# Use the complete key name as the listing prefix and take the first match,
# to confirm that a Key can be recovered from its name alone.
list(b.list(prefix='tfl_api_line_mode_status_tube_2015-02-24_11:51:45.json'))[0]


Out[54]:
<Key: pivotal-london-dis,tfl_api_line_mode_status_tube_2015-02-24_11:51:45.json>

In [61]:
# getattr with a fallback: the Key has a `key` attribute, so the default
# 'myfile' is not used here.
getattr(l, 'key', 'myfile')


Out[61]:
'tfl_api_line_mode_status_tube_2015-02-24_11:51:45.json'

In [62]:
# Same value via plain attribute access, for comparison with the getattr call.
l.key


Out[62]:
'tfl_api_line_mode_status_tube_2015-02-24_11:51:45.json'

In [65]:
# Missing attribute: getattr returns the supplied default instead of raising
# AttributeError.
getattr(l, 'key2', 'mydefault')


Out[65]:
'mydefault'

In [86]:
# Run the project parser over every key matching the prefix (2015-02-28,
# 11:5x). The listing is materialised with list() before being passed in.
newdf = parsing.parse_file_list(list(b.list(prefix='tfl_api_line_mode_status_tube_2015-02-28_11:5')))

In [90]:
# Quick sanity check on the size of the parsed result.
len(newdf)


Out[90]:
10

Test S3 parsing times


In [93]:
# Ten files: time the S3 parse for a prefix that only covers the snapshots
# stamped 2015-02-28 11:5x.
%timeit parsing.parse_s3_files('tfl_api_line_mode_status_tube_2015-02-28_11:5')


1 loops, best of 3: 649 ms per loop

In [94]:
# One day: time the S3 parse for every snapshot dated 2015-02-28.
%timeit parsing.parse_s3_files('tfl_api_line_mode_status_tube_2015-02-28')


1 loops, best of 3: 2min 44s per loop

In [97]:
# Extrapolate the one-day timing measured above (2 min 44 s per run) to 210
# such runs and convert the total from seconds to hours.
one_day_parse_seconds = 2 * 60 + 44
# NOTE(review): 210 is presumably the number of days of data to parse — confirm.
day_count = 210
estimated_hours = one_day_parse_seconds * day_count / 3600
estimated_hours


Out[97]:
9.566666666666666

In [ ]:
# Unexecuted scratch cell: would re-display the full-bucket listing object.
list_of_files

In [ ]:


In [ ]: